Differential expression using the bulk RNAseq JAX_RNAseq_Neuroectoderm data from the June 2024 release.

There is only one model system in this dataset.

There are samples from multiple days.

R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

path = "../../MorPhiC/data/June-2024/JAX_RNAseq_Neuroectoderm/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")

Read in meta data

meta = fread("data/June_2024/JAX_RNAseq_Neuroectoderm_meta_data.tsv", 
             data.table = FALSE)
dim(meta)
## [1] 174  36
meta[1:2,]
##   clonal.label library_preparation.label                label
## 1  MOK20002-WT                GT23-05635 CBO-MOK20002-WT-Day7
## 2  MOK20002-WT                GT23-05625 CBO-MOK20002-WT-Day6
##                               description differentiated_product_protocol_id
## 1 KOLF2.2 derived cortical brain organoid                           JAXPD003
## 2 KOLF2.2 derived cortical brain organoid                           JAXPD003
##   model_system timepoint_value timepoint_unit final_timepoint
## 1          CBO               7           days              No
## 2          CBO               6           days              No
##   treatment_condition wt_control_status ko_strategy ko_gene
## 1      Not applicable                WT          WT      WT
## 2      Not applicable                WT          WT      WT
##   library_preparation.description
## 1                              NA
## 2                              NA
##   library_preparation.library_preparation_protocol_id
## 1                                            JAXPL001
## 2                                            JAXPL001
##   library_preparation.average_fragment_size
## 1                                       410
## 2                                       425
##   library_preparation.input_amount_value library_preparation.input_amount_unit
## 1                                    300                                    ng
## 2                                    300                                    ng
##   library_preparation.concentration_value
## 1                                    51.8
## 2                                    33.8
##   library_preparation.concentration_unit library_preparation.total_yield_value
## 1                                     nM                                 249.6
## 2                                     nM                                 170.0
##   library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1                                   ng                                  10
## 2                                   ng                                  10
##   library_preparation.pcr_cycles_for_indexing
## 1                               Not available
## 2                               Not available
##                                             file_id                     run_id
## 1 RFX_WT_Day7_GT23-05635_GCACGGAC-TGCGAGAC_S55_L002 20230424_23-scbct-028-run3
## 2 RFX_WT_Day6_GT23-05625_GACCTGAA-CTCACCAA_S97_L002 20230424_23-scbct-028-run3
##   clonal.description clonal.parental_name clonal.clone_id clonal.type
## 1            KOLF2.2             KOLF2.2J              WT        iPSC
## 2            KOLF2.2             KOLF2.2J              WT        iPSC
##   clonal.zygosity clonal.cell_line_generation_protocol
## 1  Not applicable                       Not applicable
## 2  Not applicable                       Not applicable
##   clonal.treatment_condition clonal.wt_control_status
## 1             Not applicable                       WT
## 2             Not applicable                       WT
##   expression_alteration.label                                  model_organ
## 1              Not applicable Cortical brain (dorsal forebrain patterning)
## 2              Not applicable Cortical brain (dorsal forebrain patterning)
names(meta)
##  [1] "clonal.label"                                       
##  [2] "library_preparation.label"                          
##  [3] "label"                                              
##  [4] "description"                                        
##  [5] "differentiated_product_protocol_id"                 
##  [6] "model_system"                                       
##  [7] "timepoint_value"                                    
##  [8] "timepoint_unit"                                     
##  [9] "final_timepoint"                                    
## [10] "treatment_condition"                                
## [11] "wt_control_status"                                  
## [12] "ko_strategy"                                        
## [13] "ko_gene"                                            
## [14] "library_preparation.description"                    
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"          
## [17] "library_preparation.input_amount_value"             
## [18] "library_preparation.input_amount_unit"              
## [19] "library_preparation.concentration_value"            
## [20] "library_preparation.concentration_unit"             
## [21] "library_preparation.total_yield_value"              
## [22] "library_preparation.total_yield_unit"               
## [23] "library_preparation.cdna_pcr_cycles"                
## [24] "library_preparation.pcr_cycles_for_indexing"        
## [25] "file_id"                                            
## [26] "run_id"                                             
## [27] "clonal.description"                                 
## [28] "clonal.parental_name"                               
## [29] "clonal.clone_id"                                    
## [30] "clonal.type"                                        
## [31] "clonal.zygosity"                                    
## [32] "clonal.cell_line_generation_protocol"               
## [33] "clonal.treatment_condition"                         
## [34] "clonal.wt_control_status"                           
## [35] "expression_alteration.label"                        
## [36] "model_organ"
table(table(meta$clonal.label))
## 
##  1  2  3  4 
## 36  5 16 20
table(table(meta$library_preparation.label))
## 
##   1 
## 174
table(table(meta$label))
## 
##   1 
## 174

Read in gene count data and filter genes

cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592   175
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name RFX_CE_F5_Day8_GT23-05641_GTCGGAGC-TTATAACC_S73_L002
## 1 ENSG00000268674                                                    0
## 2 ENSG00000271254                                                 1521
##   LHX5_CE_A1_Day3_GT23-07300_AAGTCCAA-TACTCATA_S163_L004
## 1                                                      0
## 2                                                   1270
##   LHX5_KO_E4_Day5_GT23-07318_AACGTTCC-AGTACTCC_S166_L004
## 1                                                      0
## 2                                                    746
stopifnot(setequal(names(cts)[-1], meta$file_id))

meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##  174
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

unique_timepoints = unique(meta$timepoint_value)
unique_timepoints = sort(unique_timepoints)

xx = as.character(meta$timepoint_value)
xx = factor(xx, levels = as.character(unique_timepoints))
meta$timepoint = xx

table(meta$ko_gene, meta$timepoint)
##        
##          3  4  5  6  7  8  9 14
##   FEZF2  0  0  9  9  9  9  0  0
##   LHX5   9  9  0 18  0  0  0  0
##   LHX9   0  0  0  0  0  0  0  9
##   OTX1   0  0  0  0  9  9  9  0
##   PAX6   0  0  0  0  9  0  0  0
##   RFX4   0  0  0  9  9  9  9  0
##   WT     1  1  1  6  4  5  2  1

Read in gene annoation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)
## character(0)

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
table(model_s, useNA="ifany")
## model_s
## CBO 
## 174
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       apply(cts_dat, 1, tapply, model_s, min), 
                       apply(cts_dat, 1, tapply, model_s, median), 
                       apply(cts_dat, 1, tapply, model_s, get_q75), 
                       apply(cts_dat, 1, tapply, model_s, max))

dim(gene_info)
## [1] 38592     5
gene_info[1:2, ]
##                            Name apply.cts_dat..1..tapply..model_s..min.
## ENSG00000268674 ENSG00000268674                                       0
## ENSG00000271254 ENSG00000271254                                     426
##                 apply.cts_dat..1..tapply..model_s..median.
## ENSG00000268674                                          0
## ENSG00000271254                                       1318
##                 apply.cts_dat..1..tapply..model_s..get_q75.
## ENSG00000268674                                        0.00
## ENSG00000271254                                     1701.75
##                 apply.cts_dat..1..tapply..model_s..max.
## ENSG00000268674                                       3
## ENSG00000271254                                    3117
table(row.names(gene_info)==gene_info$Name, useNA="ifany")
## 
##  TRUE 
## 38592
names(gene_info)[2:5] = paste0(rep(c("CBO_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=1))
dim(gene_info)
## [1] 38592     5
gene_info[1:2,]
##                            Name CBO_min CBO_median CBO_q75 CBO_max
## ENSG00000268674 ENSG00000268674       0          0    0.00       3
## ENSG00000271254 ENSG00000271254     426       1318 1701.75    3117
summary(gene_info[,-1])
##     CBO_min          CBO_median          CBO_q75            CBO_max       
##  Min.   :    0.0   Min.   :     0.0   Min.   :     0.0   Min.   :      0  
##  1st Qu.:    0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:      2  
##  Median :    0.0   Median :     3.0   Median :     5.8   Median :     22  
##  Mean   :  213.8   Mean   :   699.1   Mean   :   931.4   Mean   :   1829  
##  3rd Qu.:   89.0   3rd Qu.:   332.1   3rd Qu.:   474.6   3rd Qu.:   1003  
##  Max.   :88969.0   Max.   :489181.5   Max.   :586205.0   Max.   :1301990
table(gene_info$CBO_q75 > 0)
## 
## FALSE  TRUE 
## 14390 24202
w2kp = which(gene_info$CBO_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 24202   174
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")

ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID") + 
  scale_color_brewer(palette = "Set1")

Check the effect of run id among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   424.0   654.0   876.0   880.7  1047.0  1276.0
dim(cts_dat_m)
## [1] 24202    21
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    21 23745
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.27258414 0.16356916 0.10631824 0.07678308 0.05052017
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 27.3 16.4 10.6  7.7  5.1
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

# there is only one level for model_system here
# no need to use model_system for the shape
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, 
                        color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) + 
  theme_classic() + 
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

table(PC_df$run_id_short, PC_df$timepoint)
##                 
##                  3 4 5 6 7 8 9 14
##   robson-003     0 0 0 0 0 0 1  0
##   robson-004     0 0 0 3 0 0 0  0
##   robson-006     0 0 0 0 0 3 0  0
##   robson-020     0 0 0 0 1 1 1  0
##   scbct-008      0 0 0 0 1 0 0  0
##   scbct-028-run3 0 0 0 1 1 1 0  0
##   scbct-039-run2 0 0 1 1 1 0 0  0
##   scbct-052      1 1 0 1 0 0 0  0
##   scbct-092      0 0 0 0 0 0 0  1
n_shapes = length(unique(PC_df$run_id_short))
u_shapes = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:n_shapes]

n_time_point = length(unique(PC_df$timepoint))

gPC_time = ggplot(PC_df, aes(x = PC1, y = PC2, 
                        color=timepoint,
                        shape=run_id_short)) +
  geom_point(size=2.5) + 
  scale_shape_manual(values = u_shapes) + 
  scale_color_brewer(palette = "Dark2") +
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) + 
  theme_classic() + 
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC_time)

Analyze samples of different model systems

For the June 2024 release of JAX_RNAseq_Neuroectoderm, there is only one model system.

table(meta$run_id, meta$model_system, useNA="ifany")
##                             
##                              CBO
##   20230228_23-scbct-008       10
##   20230424_23-scbct-028-run3  30
##   20230508_23-scbct-039-run2  30
##   20230522_23-scbct-052       30
##   20230824_23-scbct-092       10
##   20240131_23-robson-020      30
##   20240307_24-robson-003      10
##   20240307_24-robson-004      12
##   20240307_24-robson-006      12
table(meta$run_id, meta$ko_gene, useNA="ifany")
##                             
##                              FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
##   20230228_23-scbct-008          0    0    0    0    9    0  1
##   20230424_23-scbct-028-run3     0    0    0    0    0   27  3
##   20230508_23-scbct-039-run2    27    0    0    0    0    0  3
##   20230522_23-scbct-052          0   27    0    0    0    0  3
##   20230824_23-scbct-092          0    0    9    0    0    0  1
##   20240131_23-robson-020         0    0    0   27    0    0  3
##   20240307_24-robson-003         0    0    0    0    0    9  1
##   20240307_24-robson-004         0    9    0    0    0    0  3
##   20240307_24-robson-006         9    0    0    0    0    0  3
table(meta$ko_strategy, meta$ko_gene, useNA="ifany")
##      
##       FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
##   CE     12   12    3    9    3   12  0
##   KO     12   12    3    9    3   12  0
##   PTC    12   12    3    9    3   12  0
##   WT      0    0    0    0    0    0 21
df1 = as.data.frame(table(meta$timepoint, meta$ko_gene))
names(df1)[1:2] = c("time", "gene")
df1$Freq[df1$gene != "WT"] = df1$Freq[df1$gene != "WT"]/3

ggplot(df1, aes(x = time, y = gene, fill = Freq)) + 
  geom_tile(color = "black", linewidth = 0.3) + 
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() + 
  geom_text(aes(label = ifelse(gene == "WT", Freq, "")), 
            color = "black", size = 3) + 
  labs(x = "Time", y = "Gene")

Explore the PC plots first, to decide running the models on what level of DE group.

Then run the DE analysis.

for(model1 in unique(meta$model_system)){
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]
 
  table(meta_m$ko_strategy)
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  
  # verify the consistency between ko_strategy from metadata column and 
  # that from filenames
  extracted_ko_strings = str_extract_all(meta_m$file_id, "CE|KO|PTC|WT")
  print(table(sapply(extracted_ko_strings, length)))
  print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))

  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(unique(model_s)) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) + 
    theme_classic() + 
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  # too many levels for shape, need to manually specify it
  print(length(unique(PC_df$ko_gene)))
  print(length(unique(PC_df$run_id_short)))
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + 
    scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(unique(model_s)) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme_classic() + 
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  print(table(meta_m$run_id_short, meta_m$ko_gene))
  
  colnames(PC_df)[which(colnames(PC_df)=="timepoint")] = "day"
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_gene, color=day)) +
    geom_point(size=2.5, alpha=0.7) + 
    scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:length(unique(PC_df$ko_gene))]) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(unique(model_s)) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme_classic() + 
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_gene, color=day, 
                          alpha=ifelse(ko_gene == "WT", 1, 0.6))) +
    geom_point(size=2.5) + 
    scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(1, 7, 8, 9, 10, 11, 15, 16, 17)[1:length(unique(PC_df$ko_gene))]) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(unique(model_s)) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2), 
      alpha = "none"
    ) +
    theme_classic() + 
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
}
## [1] "CBO"
## 
##   1 
## 174 
##      
##       CE KO PTC WT
##   CE  51  0   0  0
##   KO   0 51   0  0
##   PTC  0  0  51  0
##   WT   0  0   0 21

## [1] 7
## [1] 9

##                 
##                  FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
##   robson-003         0    0    0    0    0    9  1
##   robson-004         0    9    0    0    0    0  3
##   robson-006         9    0    0    0    0    0  3
##   robson-020         0    0    0   27    0    0  3
##   scbct-008          0    0    0    0    9    0  1
##   scbct-028-run3     0    0    0    0    0   27  3
##   scbct-039-run2    27    0    0    0    0    0  3
##   scbct-052          0   27    0    0    0    0  3
##   scbct-092          0    0    9    0    0    0  1

Use day to construct DE groups.

Run DESeq2 for different day groups separately

day 3 + day 4, with a day indictor
day 5 (do not run for day 5, since there is only one WT)
day 6
day 7
day 8 (exclude samples from run ID short scbct-028-run3)
day 9 (do not include day 14 in this setting)

For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.

In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + day + condition, model system + day + run_id, model system + day, etc.).

The first version of the output files contains

gene id, gene symbol

and for each knockout strategy:

baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

The second version of the output files contains

gene id, gene symbol, chr, strand, position

and for each knockout strategy:

log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4)

In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.

Prepare output directories for DE results

df_size = NULL

release_name = "2024_06_JAX_RNAseq_Neuroectoderm"

output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")

if (!file.exists(raw_output_dir)){
  dir.create(raw_output_dir, recursive = TRUE)
}

if (!file.exists(processed_output_dir)){
  dir.create(processed_output_dir, recursive = TRUE)
}

DE analysis for all genes

for(model1 in unique(meta$model_system)){

  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]
 
  table(meta_m$ko_strategy)
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  
  ## Generate sample information matrix
  
  cols2kp = c("model_system", "ko_strategy", "ko_gene", "run_id", "run_id_short", 
              "timepoint", "q75")
  sample_info = meta_m[,cols2kp]
  colnames(sample_info)[which(colnames(sample_info)=="timepoint")] = "day"
  
  dim(sample_info)
  sample_info[1:2,]
  
  table(sample_info$ko_strategy, sample_info$ko_gene)
  
  sample_info$ko_group = paste(sample_info$ko_gene, 
                               sample_info$ko_strategy, sep="_")
  print(table(sample_info$ko_group))
  print(length(unique(sample_info$ko_group)))
  print(table(sample_info$run_id, sample_info$ko_group))
  
  print(table(sample_info$ko_group, sample_info$day))
  
  sample_info$ko_group_run_id = paste(sample_info$ko_group, 
                                      sample_info$run_id_short, sep="_")

  print(table(sample_info$ko_gene, sample_info$day)) 
  print(table(sample_info$ko_group_run_id, sample_info$day)) 
  
  # print the table of ko_group and run_id by day
  for (cur_day in as.character(unique_timepoints)){
    sample_info_by_day = sample_info[which(sample_info$day==cur_day), ]
    print("---------------------------")
    print(paste0("On day ", as.character(cur_day), ": "))
    print(table(sample_info_by_day$ko_group, sample_info_by_day$run_id_short))     
  }
  
  #sample_info$log_q75 = log(sample_info$q75)
  
  
  # run DESeq2 for different day groups separately
  # day 3 + day 4
  # day 6
  # day 7
  # day 8 (not excluding RFX4 for now)
  # day 9 (do not include day 14 in this setting)
  # do not include run_id in the model if there is only one run_id in certain day
  
  sample_info$day_group = as.character(sample_info$day)
  sample_info$day_group[which(sample_info$day_group%in%c("3", "4"))] = "3_4"
  table(sample_info$day_group)
  
  sample_info$de_grp = paste0(sample_info$model_system, "_day_", sample_info$day_group)

  unique_de_grps = setdiff(unique(sample_info$de_grp), 
                           c(paste0(model1, "_day_5"), paste0(model1, "_day_14")))
  sorted_de_grps = sort(unique_de_grps)
  
  for (d_group in sorted_de_grps){
    
    print("===================================")
    print(sprintf("day group %s DESeq2 results:", d_group))
    print("===================================")  
    
    dg_index = which(sample_info$de_grp==d_group)
    cts_dat_m_dg = cts_dat_m[, dg_index]
    sample_info_dg = sample_info[dg_index, ]
    
    # additional step for DE group CBO_day_8, to exclude samples from run id short scbct-028-run3
    if (d_group == "CBO_day_8"){
      
      kept_index = which(sample_info_dg$run_id_short!="scbct-028-run3")
      cts_dat_m_dg = cts_dat_m_dg[, kept_index]
      sample_info_dg = sample_info_dg[kept_index, ]
      
      stopifnot(all(sample_info_dg$run_id_short!="scbct-028-run3"))
      
    }
      
    stopifnot(all(sample_info_dg$de_grp==d_group))
    
    # do two steps of filtering
    # first, filter based on q75 of each gene
    # second, filter based on whether each gene is expressed in at least 4 cells
    
    dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
    cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
    
    print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                  length(dg_q75), nrow(cts_dat_m_dg)))

    n_pos = rowSums(cts_dat_m_dg>0)
    cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]

    if (length(n_pos)==nrow(cts_dat_m_dg)){
      print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
    }else{
      print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                    length(n_pos), nrow(cts_dat_m_dg)))    
    }
    
    # update the q75 based on only genes that will be used in the DE analysis
    sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
    sample_info_dg$log_q75 = log(sample_info_dg$q75)
    
    # add day ID as another covariate for day group 3_4
    # but cannot do that for day group 9_14,
    # since day 14 completely overlaps with run ID scbct-092
    # for day 9 and day 14, only run analysis on day 9 samples
    
    # since day 3_4 only has one run id
    # write out the setting for day 3_4 specifically
    
    if (d_group == "CBO_day_3_4"){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + day + log_q75)      
    }else if (length(unique(sample_info_dg$run_id))==1){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + log_q75)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + run_id + log_q75)     
    }
  
    start.time <- Sys.time()
    dds = DESeq(dds)
    end.time <- Sys.time()
      
    time.taken <- end.time - start.time
    print(time.taken)
    
    ## association with read-depth
    res_rd = results(dds, name = "log_q75")
    res_rd = as.data.frame(res_rd)
    dim(res_rd)
    res_rd[1:2,]
    
    table(is.na(res_rd$padj))
  
    g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(d_group, " read depth"))
    print(g0)
  
    ## Rerun the analysis without read-depth if it is not significant for 
    ## most genes, and the number of samples is small
    ## i.e., proportion of non-null in the distribution is smaller than 0.01
    ## (this needs to be restricted to the genes with not NA adjusted pvalue)
    ## or if smaller than 0.1 and the number of samples is not greater than 6

    pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
    pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
    
    print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
    
    if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
  
      print("rerun DESeq2 without read depth")
      
      include_rd = 0
      
      if (d_group == "CBO_day_3_4"){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + day)      
      }else if (length(unique(sample_info_dg$run_id))==1){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group)
      }else{
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + run_id)     
      }
      
      dds = DESeq(dds)
      
    }else{
      include_rd = 1
    }

    ## association with run_id
    
    if (length(unique(sample_info_dg$run_id))>1){
      
      # de_group 3_4 only has one run id, this part is not relevant for it
      if(include_rd)
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
      else
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)

      res_lrt = results(dds_lrt)
    
      res_run_id = as.data.frame(res_lrt)
      dim(res_run_id)
      res_run_id[1:2,]
    
      table(is.na(res_run_id$padj))
      
      g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(d_group, " Run ID"))
      print(g0)
    
    }
    
    ## DE analysis for each KO gene
    table(sample_info_dg$ko_gene)
    table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
    
    genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
    genos
    
    ko_grp  = c("CE", "KO", "PTC")
    
    for(geno in genos){
      
      res_geno_df = NULL
      res_geno_reduced_df = NULL
      res_geno = list()
      
      for(k_grp1 in ko_grp){
        
        res_g1 = results(dds, contrast = c("ko_group", 
                                           paste0(geno, "_", k_grp1), "WT_WT"))
        res_g1 = signif(data.frame(res_g1), 3)
        
        # add a column to record the number of zeros per test
        test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
        
        cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
        n_zero = rowSums(cts_dat_m_dg_test==0)
        res_g1$n_zeros = n_zero        
        
        # record the number of samples involved in the comparison
        test_ko_group_vec = sample_info_dg$ko_group[test_index]
        
        if (d_group=="CBO_day_8"){
          d_group_for_size = paste0(d_group, "_excluding_20230424_23-scbct-028-run3")
        }else{
          d_group_for_size = d_group
        }
        df_size = rbind(df_size, 
                        c(d_group_for_size, 
                          paste0(geno, "_", k_grp1), 
                          sum(test_ko_group_vec!="WT_WT"), 
                          sum(test_ko_group_vec=="WT_WT")))       
        
        # prepare a processed version of output
        res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
        res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
        
        if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
          res_g1_reduced$padj = NA
        }
        
        res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
        
        res_geno[[k_grp1]] = res_g1_reduced     
        
        g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
             geom_histogram(color="darkblue", fill="lightblue", 
                            breaks = seq(0,1,by=0.01)) + 
             ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
        print(g1)

  
        tag1 = sprintf("%s_%s_", geno, k_grp1)
        colnames(res_g1) = paste0(tag1, colnames(res_g1))
        colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
        
        if(is.null(res_geno_df)){
          res_geno_df = res_g1
        }else if(!is.null(res_geno_df)){
          stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
          res_geno_df = cbind(res_geno_df, res_g1)
        }
        
        if(is.null(res_geno_reduced_df)){
          res_geno_reduced_df = res_g1_reduced
        }else if(!is.null(res_geno_reduced_df)){
          stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
          res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
        }        
        
      }
      
      get_index <- function(x){
        which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
      }
      
      w_ce = get_index(res_geno[["CE"]])
      w_ko = get_index(res_geno[["KO"]])
      w_ptc = get_index(res_geno[["PTC"]])
  
      print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
      
      print(paste0("number of DE genes from knock out strategy CE: ", 
                   as.character(length(w_ce))))
      print(paste0("number of DE genes from knock out strategy KO: ", 
                   as.character(length(w_ko))))
      print(paste0("number of DE genes from knock out strategy PTC: ", 
                   as.character(length(w_ptc))))
      
      ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce], 
                                       rownames(res_geno[["KO"]])[w_ko]))
      
      ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko], 
                                        rownames(res_geno[["PTC"]])[w_ptc]))
      
      ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                        rownames(res_geno[["CE"]])[w_ce]))
      
      print(paste0("number of common DE genes by knock out strategies CE and KO: ", 
                   as.character(ce_ko_overlap)))
      print(paste0("number of common DE genes by knock out strategies KO and PTC: ", 
                   as.character(ko_ptc_overlap)))
      print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                   as.character(ptc_ce_overlap)))
            
      if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                               "KO" = rownames(res_geno[["KO"]])[w_ko], 
                               "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        print(g_up)
      }
      
      df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                       padj_KO = res_geno[["KO"]]$padj, 
                       padj_PTC = res_geno[["PTC"]]$padj)
  
      cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
      cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
      
      g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_CE))) +
           geom_pointdensity() + 
           ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
           scale_color_viridis()
          
      g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_KO))) +
           geom_pointdensity() + 
           ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) + 
           scale_color_viridis()
          
      print(g4)
      print(g5)
  
  
      dim(res_geno_df)
      res_geno_df[1:2,]
      
      dim(res_geno_reduced_df)
      res_geno_reduced_df[1:2,]
      
      res_df = data.frame(gene_ID=rownames(res_geno_df), 
                          res_geno_df)
      dim(res_df)
      res_df[1:2,]
  
      res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), 
                                              gene_anno$ensembl_gene_id), ]
      stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
  
      # set gene hgnc_symbol that equal "" to NA
      hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
      hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
        
      res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df), 
                                  hgnc_symbol=hgnc_symbol_vec,
                                  chr=res_reduced_gene_anno$chr, 
                                  strand=res_reduced_gene_anno$strand, 
                                  start=res_reduced_gene_anno$start, 
                                  end=res_reduced_gene_anno$end, 
                                  res_geno_reduced_df)
      
      print("hgnc_symbol empty string and NA tables:")
      print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
      print(table(is.na(res_reduced_df$hgnc_symbol)))
        
      dim(res_reduced_df)
      res_reduced_df[1:2,]
    
      fwrite(res_df, 
             sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                     raw_output_dir, release_name, d_group, geno), 
             sep="\t")
      fwrite(res_reduced_df, 
             sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                     processed_output_dir, release_name, d_group, geno), 
             sep="\t")
    
    }
    
  }
  
}
## [1] "CBO"
## 
##  FEZF2_CE  FEZF2_KO FEZF2_PTC   LHX5_CE   LHX5_KO  LHX5_PTC   LHX9_CE   LHX9_KO 
##        12        12        12        12        12        12         3         3 
##  LHX9_PTC   OTX1_CE   OTX1_KO  OTX1_PTC   PAX6_CE   PAX6_KO  PAX6_PTC   RFX4_CE 
##         3         9         9         9         3         3         3        12 
##   RFX4_KO  RFX4_PTC     WT_WT 
##        12        12        21 
## [1] 19
##                             
##                              FEZF2_CE FEZF2_KO FEZF2_PTC LHX5_CE LHX5_KO
##   20230228_23-scbct-008             0        0         0       0       0
##   20230424_23-scbct-028-run3        0        0         0       0       0
##   20230508_23-scbct-039-run2        9        9         9       0       0
##   20230522_23-scbct-052             0        0         0       9       9
##   20230824_23-scbct-092             0        0         0       0       0
##   20240131_23-robson-020            0        0         0       0       0
##   20240307_24-robson-003            0        0         0       0       0
##   20240307_24-robson-004            0        0         0       3       3
##   20240307_24-robson-006            3        3         3       0       0
##                             
##                              LHX5_PTC LHX9_CE LHX9_KO LHX9_PTC OTX1_CE OTX1_KO
##   20230228_23-scbct-008             0       0       0        0       0       0
##   20230424_23-scbct-028-run3        0       0       0        0       0       0
##   20230508_23-scbct-039-run2        0       0       0        0       0       0
##   20230522_23-scbct-052             9       0       0        0       0       0
##   20230824_23-scbct-092             0       3       3        3       0       0
##   20240131_23-robson-020            0       0       0        0       9       9
##   20240307_24-robson-003            0       0       0        0       0       0
##   20240307_24-robson-004            3       0       0        0       0       0
##   20240307_24-robson-006            0       0       0        0       0       0
##                             
##                              OTX1_PTC PAX6_CE PAX6_KO PAX6_PTC RFX4_CE RFX4_KO
##   20230228_23-scbct-008             0       3       3        3       0       0
##   20230424_23-scbct-028-run3        0       0       0        0       9       9
##   20230508_23-scbct-039-run2        0       0       0        0       0       0
##   20230522_23-scbct-052             0       0       0        0       0       0
##   20230824_23-scbct-092             0       0       0        0       0       0
##   20240131_23-robson-020            9       0       0        0       0       0
##   20240307_24-robson-003            0       0       0        0       3       3
##   20240307_24-robson-004            0       0       0        0       0       0
##   20240307_24-robson-006            0       0       0        0       0       0
##                             
##                              RFX4_PTC WT_WT
##   20230228_23-scbct-008             0     1
##   20230424_23-scbct-028-run3        9     3
##   20230508_23-scbct-039-run2        0     3
##   20230522_23-scbct-052             0     3
##   20230824_23-scbct-092             0     1
##   20240131_23-robson-020            0     3
##   20240307_24-robson-003            3     1
##   20240307_24-robson-004            0     3
##   20240307_24-robson-006            0     3
##            
##             3 4 5 6 7 8 9 14
##   FEZF2_CE  0 0 3 3 3 3 0  0
##   FEZF2_KO  0 0 3 3 3 3 0  0
##   FEZF2_PTC 0 0 3 3 3 3 0  0
##   LHX5_CE   3 3 0 6 0 0 0  0
##   LHX5_KO   3 3 0 6 0 0 0  0
##   LHX5_PTC  3 3 0 6 0 0 0  0
##   LHX9_CE   0 0 0 0 0 0 0  3
##   LHX9_KO   0 0 0 0 0 0 0  3
##   LHX9_PTC  0 0 0 0 0 0 0  3
##   OTX1_CE   0 0 0 0 3 3 3  0
##   OTX1_KO   0 0 0 0 3 3 3  0
##   OTX1_PTC  0 0 0 0 3 3 3  0
##   PAX6_CE   0 0 0 0 3 0 0  0
##   PAX6_KO   0 0 0 0 3 0 0  0
##   PAX6_PTC  0 0 0 0 3 0 0  0
##   RFX4_CE   0 0 0 3 3 3 3  0
##   RFX4_KO   0 0 0 3 3 3 3  0
##   RFX4_PTC  0 0 0 3 3 3 3  0
##   WT_WT     1 1 1 6 4 5 2  1
##        
##          3  4  5  6  7  8  9 14
##   FEZF2  0  0  9  9  9  9  0  0
##   LHX5   9  9  0 18  0  0  0  0
##   LHX9   0  0  0  0  0  0  0  9
##   OTX1   0  0  0  0  9  9  9  0
##   PAX6   0  0  0  0  9  0  0  0
##   RFX4   0  0  0  9  9  9  9  0
##   WT     1  1  1  6  4  5  2  1
##                           
##                            3 4 5 6 7 8 9 14
##   FEZF2_CE_robson-006      0 0 0 0 0 3 0  0
##   FEZF2_CE_scbct-039-run2  0 0 3 3 3 0 0  0
##   FEZF2_KO_robson-006      0 0 0 0 0 3 0  0
##   FEZF2_KO_scbct-039-run2  0 0 3 3 3 0 0  0
##   FEZF2_PTC_robson-006     0 0 0 0 0 3 0  0
##   FEZF2_PTC_scbct-039-run2 0 0 3 3 3 0 0  0
##   LHX5_CE_robson-004       0 0 0 3 0 0 0  0
##   LHX5_CE_scbct-052        3 3 0 3 0 0 0  0
##   LHX5_KO_robson-004       0 0 0 3 0 0 0  0
##   LHX5_KO_scbct-052        3 3 0 3 0 0 0  0
##   LHX5_PTC_robson-004      0 0 0 3 0 0 0  0
##   LHX5_PTC_scbct-052       3 3 0 3 0 0 0  0
##   LHX9_CE_scbct-092        0 0 0 0 0 0 0  3
##   LHX9_KO_scbct-092        0 0 0 0 0 0 0  3
##   LHX9_PTC_scbct-092       0 0 0 0 0 0 0  3
##   OTX1_CE_robson-020       0 0 0 0 3 3 3  0
##   OTX1_KO_robson-020       0 0 0 0 3 3 3  0
##   OTX1_PTC_robson-020      0 0 0 0 3 3 3  0
##   PAX6_CE_scbct-008        0 0 0 0 3 0 0  0
##   PAX6_KO_scbct-008        0 0 0 0 3 0 0  0
##   PAX6_PTC_scbct-008       0 0 0 0 3 0 0  0
##   RFX4_CE_robson-003       0 0 0 0 0 0 3  0
##   RFX4_CE_scbct-028-run3   0 0 0 3 3 3 0  0
##   RFX4_KO_robson-003       0 0 0 0 0 0 3  0
##   RFX4_KO_scbct-028-run3   0 0 0 3 3 3 0  0
##   RFX4_PTC_robson-003      0 0 0 0 0 0 3  0
##   RFX4_PTC_scbct-028-run3  0 0 0 3 3 3 0  0
##   WT_WT_robson-003         0 0 0 0 0 0 1  0
##   WT_WT_robson-004         0 0 0 3 0 0 0  0
##   WT_WT_robson-006         0 0 0 0 0 3 0  0
##   WT_WT_robson-020         0 0 0 0 1 1 1  0
##   WT_WT_scbct-008          0 0 0 0 1 0 0  0
##   WT_WT_scbct-028-run3     0 0 0 1 1 1 0  0
##   WT_WT_scbct-039-run2     0 0 1 1 1 0 0  0
##   WT_WT_scbct-052          1 1 0 1 0 0 0  0
##   WT_WT_scbct-092          0 0 0 0 0 0 0  1
## [1] "---------------------------"
## [1] "On day 3: "
##           
##            scbct-052
##   LHX5_CE          3
##   LHX5_KO          3
##   LHX5_PTC         3
##   WT_WT            1
## [1] "---------------------------"
## [1] "On day 4: "
##           
##            scbct-052
##   LHX5_CE          3
##   LHX5_KO          3
##   LHX5_PTC         3
##   WT_WT            1
## [1] "---------------------------"
## [1] "On day 5: "
##            
##             scbct-039-run2
##   FEZF2_CE               3
##   FEZF2_KO               3
##   FEZF2_PTC              3
##   WT_WT                  1
## [1] "---------------------------"
## [1] "On day 6: "
##            
##             robson-004 scbct-028-run3 scbct-039-run2 scbct-052
##   FEZF2_CE           0              0              3         0
##   FEZF2_KO           0              0              3         0
##   FEZF2_PTC          0              0              3         0
##   LHX5_CE            3              0              0         3
##   LHX5_KO            3              0              0         3
##   LHX5_PTC           3              0              0         3
##   RFX4_CE            0              3              0         0
##   RFX4_KO            0              3              0         0
##   RFX4_PTC           0              3              0         0
##   WT_WT              3              1              1         1
## [1] "---------------------------"
## [1] "On day 7: "
##            
##             robson-020 scbct-008 scbct-028-run3 scbct-039-run2
##   FEZF2_CE           0         0              0              3
##   FEZF2_KO           0         0              0              3
##   FEZF2_PTC          0         0              0              3
##   OTX1_CE            3         0              0              0
##   OTX1_KO            3         0              0              0
##   OTX1_PTC           3         0              0              0
##   PAX6_CE            0         3              0              0
##   PAX6_KO            0         3              0              0
##   PAX6_PTC           0         3              0              0
##   RFX4_CE            0         0              3              0
##   RFX4_KO            0         0              3              0
##   RFX4_PTC           0         0              3              0
##   WT_WT              1         1              1              1
## [1] "---------------------------"
## [1] "On day 8: "
##            
##             robson-006 robson-020 scbct-028-run3
##   FEZF2_CE           3          0              0
##   FEZF2_KO           3          0              0
##   FEZF2_PTC          3          0              0
##   OTX1_CE            0          3              0
##   OTX1_KO            0          3              0
##   OTX1_PTC           0          3              0
##   RFX4_CE            0          0              3
##   RFX4_KO            0          0              3
##   RFX4_PTC           0          0              3
##   WT_WT              3          1              1
## [1] "---------------------------"
## [1] "On day 9: "
##           
##            robson-003 robson-020
##   OTX1_CE           0          3
##   OTX1_KO           0          3
##   OTX1_PTC          0          3
##   RFX4_CE           3          0
##   RFX4_KO           3          0
##   RFX4_PTC          3          0
##   WT_WT             1          1
## [1] "---------------------------"
## [1] "On day 14: "
##           
##            scbct-092
##   LHX9_CE          3
##   LHX9_KO          3
##   LHX9_PTC         3
##   WT_WT            1
## [1] "==================================="
## [1] "day group CBO_day_3_4 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23242"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 22.79318 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"

## [1] "DE group CBO_day_3_4 under KO gene LHX5:"
## [1] "number of DE genes from knock out strategy CE: 10"
## [1] "number of DE genes from knock out strategy KO: 7"
## [1] "number of DE genes from knock out strategy PTC: 11"
## [1] "number of common DE genes by knock out strategies CE and KO: 5"
## [1] "number of common DE genes by knock out strategies KO and PTC: 7"
## [1] "number of common DE genes by knock out strategies PTC and CE: 6"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18745  4497 
## 
## FALSE  TRUE 
## 18745  4497 
## [1] "==================================="
## [1] "day group CBO_day_6 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23818"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 42.57808 secs

## [1] "prop of non-null for rd: 2.13e-01"

## [1] "DE group CBO_day_6 under KO gene LHX5:"
## [1] "number of DE genes from knock out strategy CE: 4"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18948  4870 
## 
## FALSE  TRUE 
## 18948  4870

## [1] "DE group CBO_day_6 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 3"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 1"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18948  4870 
## 
## FALSE  TRUE 
## 18948  4870

## [1] "DE group CBO_day_6 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 1"
## [1] "number of DE genes from knock out strategy KO: 6"
## [1] "number of DE genes from knock out strategy PTC: 2"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18948  4870 
## 
## FALSE  TRUE 
## 18948  4870 
## [1] "==================================="
## [1] "day group CBO_day_7 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23665"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.517885 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"

## [1] "DE group CBO_day_7 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 1"
## [1] "number of DE genes from knock out strategy KO: 3"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18814  4851 
## 
## FALSE  TRUE 
## 18814  4851

## [1] "DE group CBO_day_7 under KO gene PAX6:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18814  4851 
## 
## FALSE  TRUE 
## 18814  4851

## [1] "DE group CBO_day_7 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 0"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18814  4851 
## 
## FALSE  TRUE 
## 18814  4851

## [1] "DE group CBO_day_7 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 2"
## [1] "number of DE genes from knock out strategy KO: 1"
## [1] "number of DE genes from knock out strategy PTC: 4"
## [1] "number of common DE genes by knock out strategies CE and KO: 1"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1"
## [1] "number of common DE genes by knock out strategies PTC and CE: 2"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18814  4851 
## 
## FALSE  TRUE 
## 18814  4851 
## [1] "==================================="
## [1] "day group CBO_day_8 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 23000"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 27.23879 secs

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"

## [1] "DE group CBO_day_8 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 84"
## [1] "number of DE genes from knock out strategy KO: 104"
## [1] "number of DE genes from knock out strategy PTC: 6"
## [1] "number of common DE genes by knock out strategies CE and KO: 12"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3"
## [1] "number of common DE genes by knock out strategies PTC and CE: 5"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18362  4638 
## 
## FALSE  TRUE 
## 18362  4638

## [1] "DE group CBO_day_8 under KO gene FEZF2:"
## [1] "number of DE genes from knock out strategy CE: 0"
## [1] "number of DE genes from knock out strategy KO: 5"
## [1] "number of DE genes from knock out strategy PTC: 0"
## [1] "number of common DE genes by knock out strategies CE and KO: 0"
## [1] "number of common DE genes by knock out strategies KO and PTC: 0"
## [1] "number of common DE genes by knock out strategies PTC and CE: 0"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18362  4638 
## 
## FALSE  TRUE 
## 18362  4638 
## [1] "==================================="
## [1] "day group CBO_day_9 DESeq2 results:"
## [1] "==================================="
## [1] "After filtering by gene expression q75, the number of genes reduces from 24202 to 22923"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 19.29888 secs

## [1] "prop of non-null for rd: 2.39e-01"

## [1] "DE group CBO_day_9 under KO gene RFX4:"
## [1] "number of DE genes from knock out strategy CE: 37"
## [1] "number of DE genes from knock out strategy KO: 56"
## [1] "number of DE genes from knock out strategy PTC: 33"
## [1] "number of common DE genes by knock out strategies CE and KO: 19"
## [1] "number of common DE genes by knock out strategies KO and PTC: 22"
## [1] "number of common DE genes by knock out strategies PTC and CE: 14"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18321  4602 
## 
## FALSE  TRUE 
## 18321  4602

## [1] "DE group CBO_day_9 under KO gene OTX1:"
## [1] "number of DE genes from knock out strategy CE: 37"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 44"
## [1] "number of common DE genes by knock out strategies CE and KO: 15"
## [1] "number of common DE genes by knock out strategies KO and PTC: 17"
## [1] "number of common DE genes by knock out strategies PTC and CE: 22"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18321  4602 
## 
## FALSE  TRUE 
## 18321  4602

Save the size dataframe out

colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
df_size = as.data.frame(df_size)

multi_rows = which(table(df_size$knockout_gene_strategy)>1)
multi_rows
##  FEZF2_CE  FEZF2_KO FEZF2_PTC   LHX5_CE   LHX5_KO  LHX5_PTC   OTX1_CE   OTX1_KO 
##         1         2         3         4         5         6         7         8 
##  OTX1_PTC   RFX4_CE   RFX4_KO  RFX4_PTC 
##         9        13        14        15
df_size[which(df_size$knockout_gene_strategy%in%names(multi_rows)),]
##                                          DE_group knockout_gene_strategy n_ko
## 1                                     CBO_day_3_4                LHX5_CE    6
## 2                                     CBO_day_3_4                LHX5_KO    6
## 3                                     CBO_day_3_4               LHX5_PTC    6
## 4                                       CBO_day_6                LHX5_CE    6
## 5                                       CBO_day_6                LHX5_KO    6
## 6                                       CBO_day_6               LHX5_PTC    6
## 7                                       CBO_day_6               FEZF2_CE    3
## 8                                       CBO_day_6               FEZF2_KO    3
## 9                                       CBO_day_6              FEZF2_PTC    3
## 10                                      CBO_day_6                RFX4_CE    3
## 11                                      CBO_day_6                RFX4_KO    3
## 12                                      CBO_day_6               RFX4_PTC    3
## 13                                      CBO_day_7               FEZF2_CE    3
## 14                                      CBO_day_7               FEZF2_KO    3
## 15                                      CBO_day_7              FEZF2_PTC    3
## 19                                      CBO_day_7                RFX4_CE    3
## 20                                      CBO_day_7                RFX4_KO    3
## 21                                      CBO_day_7               RFX4_PTC    3
## 22                                      CBO_day_7                OTX1_CE    3
## 23                                      CBO_day_7                OTX1_KO    3
## 24                                      CBO_day_7               OTX1_PTC    3
## 25 CBO_day_8_excluding_20230424_23-scbct-028-run3                OTX1_CE    3
## 26 CBO_day_8_excluding_20230424_23-scbct-028-run3                OTX1_KO    3
## 27 CBO_day_8_excluding_20230424_23-scbct-028-run3               OTX1_PTC    3
## 28 CBO_day_8_excluding_20230424_23-scbct-028-run3               FEZF2_CE    3
## 29 CBO_day_8_excluding_20230424_23-scbct-028-run3               FEZF2_KO    3
## 30 CBO_day_8_excluding_20230424_23-scbct-028-run3              FEZF2_PTC    3
## 31                                      CBO_day_9                RFX4_CE    3
## 32                                      CBO_day_9                RFX4_KO    3
## 33                                      CBO_day_9               RFX4_PTC    3
## 34                                      CBO_day_9                OTX1_CE    3
## 35                                      CBO_day_9                OTX1_KO    3
## 36                                      CBO_day_9               OTX1_PTC    3
##    n_WT
## 1     2
## 2     2
## 3     2
## 4     6
## 5     6
## 6     6
## 7     6
## 8     6
## 9     6
## 10    6
## 11    6
## 12    6
## 13    4
## 14    4
## 15    4
## 19    4
## 20    4
## 21    4
## 22    4
## 23    4
## 24    4
## 25    4
## 26    4
## 27    4
## 28    4
## 29    4
## 30    4
## 31    2
## 32    2
## 33    2
## 34    2
## 35    2
## 36    2
fwrite(df_size, 
       sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name), 
       sep="\t")
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb)  max used  (Mb)
## Ncells  8140201 434.8   12621732 674.1         NA  12621732 674.1
## Vcells 40221377 306.9  107201476 817.9      65536 107201197 817.9
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.3                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.15.4          
##  [5] dplyr_1.1.4                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.5               viridisLite_0.4.2          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-60                
## [21] stringr_1.5.1               ggpubr_0.6.0               
## [23] ggrepel_0.9.5               ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-8           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.7             tools_4.2.3            backports_1.5.0       
##  [7] bslib_0.8.0            utf8_1.2.4             R6_2.5.1              
## [10] DBI_1.2.3              colorspace_2.1-1       GetoptLong_1.0.5      
## [13] withr_3.0.1            tidyselect_1.2.1       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.3         cli_3.6.3             
## [19] Cairo_1.6-2            DelayedArray_0.24.0    labeling_0.4.3        
## [22] sass_0.4.9             scales_1.3.0           digest_0.6.37         
## [25] rmarkdown_2.28         XVector_0.38.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.8.1      highr_0.11             fastmap_1.2.0         
## [31] GlobalOptions_0.1.2    rlang_1.1.4            rstudioapi_0.16.0     
## [34] RSQLite_2.3.7          farver_2.1.2           shape_1.4.6.1         
## [37] jquerylib_0.1.4        generics_0.1.3         jsonlite_1.8.8        
## [40] BiocParallel_1.32.6    car_3.1-2              RCurl_1.98-1.16       
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.9 Matrix_1.5-4.1        
## [46] Rcpp_1.0.13            munsell_0.5.1          fansi_1.0.6           
## [49] abind_1.4-5            lifecycle_1.0.4        stringi_1.8.4         
## [52] yaml_2.3.10            carData_3.0-5          zlibbioc_1.44.0       
## [55] plyr_1.8.9             blob_1.2.4             parallel_4.2.3        
## [58] crayon_1.5.3           lattice_0.22-6         Biostrings_2.66.0     
## [61] annotate_1.76.0        circlize_0.4.16        KEGGREST_1.38.0       
## [64] locfit_1.5-9.10        knitr_1.48             pillar_1.9.0          
## [67] rjson_0.2.21           ggsignif_0.6.4         geneplotter_1.76.0    
## [70] codetools_0.2-20       XML_3.99-0.17          glue_1.7.0            
## [73] evaluate_0.24.0        foreach_1.5.2          vctrs_0.6.5           
## [76] png_0.1-8              cellranger_1.1.0       gtable_0.3.5          
## [79] purrr_1.0.2            tidyr_1.3.1            clue_0.3-65           
## [82] cachem_1.1.0           xfun_0.47              xtable_1.8-4          
## [85] broom_1.0.6            rstatix_0.7.2          tibble_3.2.1          
## [88] iterators_1.0.14       AnnotationDbi_1.60.2   memoise_2.0.1         
## [91] cluster_2.1.6